import pandas as pd
import numpy as np
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import matplotlib.pyplot as plt
import plotly.plotly as py
from plotly import tools
from datetime import date
import seaborn as sns
import random
import warnings
warnings.filterwarnings("ignore")
init_notebook_mode(connected=True)
import zipfile
import os
import winsound
import sys
sys.path.append('c:\\users\\chait\\anaconda3\\lib\\site-packages')
"""Helper Functions to do Plotting"""
def generateLayoutBar(col_name):
"""
Generate a layout object for bar chart
"""
layout_bar = go.Layout(
autosize=False, # auto size the graph? use False if you are specifying the height and width
width=800, # height of the figure in pixels
height=600, # height of the figure in pixels
title="Distribution of {} column".format(col_name), # title of the figure
# more granular control on the title font
titlefont=dict(
family='Courier New, monospace', # font family
size=14, # size of the font
color='black' # color of the font
),
# granular control on the axes objects
xaxis=dict(
tickfont=dict(
family='Courier New, monospace', # font family
size=14, # size of ticks displayed on the x axis
color='black' # color of the font
)
),
yaxis=dict(
# range=[0,100],
title='Percentage',
titlefont=dict(
size=14,
color='black'
),
tickfont=dict(
family='Courier New, monospace', # font family
size=14, # size of ticks displayed on the y axis
color='black' # color of the font
)
),
font=dict(
family='Courier New, monospace', # font family
color="white", # color of the font
size=12 # size of the font displayed on the bar
)
)
return layout_bar
def plotBar(dataframe_name, col_name, top_n=None):
"""
Plot a bar chart for the categorical columns
Arguments:
dataframe name
categorical column name
Output:
Plot
"""
# create a table with value counts
temp = dataframe_name[col_name].value_counts()
if top_n is not None:
temp = temp.head(top_n)
# creating a Bar chart object of plotly
data = [go.Bar(
x=temp.index.astype(str), # x axis values
y=np.round(temp.values.astype(float) / temp.values.sum(), 4) * 100, # y axis values
text=['{}%'.format(i) for i in np.round(temp.values.astype(float) / temp.values.sum(), 4) * 100],
# text to be displayed on the bar, we are doing this to display the '%' symbol along with the number on the bar
textposition='auto', # specify at which position on the bar the text should appear
marker=dict(color='#0047AB'),)] # change color of the bar
# color used here Cobalt Blue
layout_bar = generateLayoutBar(col_name=col_name)
fig = go.Figure(data=data, layout=layout_bar)
return iplot(fig)
outpath = "../data/"
os.listdir(outpath)
# for i in os.listdir(outpath):
# z = zipfile.ZipFile(outpath + i)
# z.extractall(path = outpath)
# z.close()
for i in os.listdir(outpath):
print(i)
datapath = '../data/Data/'
for i in os.listdir(datapath):
print(i)
Customer_Demographics = pd.read_excel(datapath + 'Customer_Demographics.xlsx')
Customer_Transaction = pd.read_excel(datapath + 'Customer_Transaction.xlsx ')
Store_Master = pd.read_excel(datapath + 'Store_Master.xlsx')
Test_Set = pd.read_excel(datapath + 'Test_Set.xlsx')
[i.shape for i in [Customer_Demographics, Customer_Transaction, Store_Master, Test_Set]]
No information about the units given in the data. Assuming Revenue in dhirams.
Customer_Demographics.head()
Customer_Demographics.dtypes
from datetime import datetime
datetime.strptime('Jun 1 2005 1:33PM', '%b %d %Y %I:%M%p')
Customer_Demographics.First_txn_dt[0]
t = datetime.strptime('02NOV2010:00:00:00', '%d%b%Y:%H:%M:%S')
date_cols_demo = [ i for i in Customer_Demographics.columns if (i.endswith('dt')) or ('date' in i)]
date_cols_demo
Customer_Demographics.First_txn_dt.dtype
Customer_Demographics[date_cols_demo] = Customer_Demographics[date_cols_demo].astype('O')
def convertToDate(x):
try:
return datetime.strptime(x, '%d%b%Y:%H:%M:%S')
except:
return x
Customer_Demographics[date_cols_demo] = Customer_Demographics[date_cols_demo].applymap(lambda x : convertToDate(x))
date_cols_demo
def extractColTypes(dataset):
"""This functions extracts numeric, categorical , datetime and boolean column types.
Returns 4 lists with respective column types"""
num_cols_list = [i for i in dataset.columns if dataset[i].dtype in ['int64','float64']]
cat_cols_list = [i for i in dataset.columns if dataset[i].dtype in ['object']]
date_cols_list = [i for i in dataset.columns if dataset[i].dtype in ['datetime64[ns]']]
bool_cols_list = [i for i in dataset.columns if dataset[i].dtype in ['bool']]
print ("Numeric Columns:", len(num_cols_list))
print ("Categorical/Character Columns:", len(cat_cols_list))
print ("Date Columns:",len(date_cols_list))
print ("Boolean Columns:",len(bool_cols_list))
return(num_cols_list,cat_cols_list,date_cols_list,bool_cols_list)
demo_num_cols_list,demo_cat_cols_list,demo_date_cols_list,demo_bool_cols_list = extractColTypes(Customer_Demographics)
demo_cat_cols_list
today = convertToDate('24NOV2018:00:00:00')
def getDaysDiff(x):
t = -1*(x - today)
return t.apply(lambda x : x.days)
Customer_Demographics['custSince'] = getDaysDiff(Customer_Demographics.First_txn_dt)
Customer_Demographics['ds_accr']= getDaysDiff(Customer_Demographics.Last_accr_txn_dt)
Customer_Demographics['ds_rdm']= getDaysDiff(Customer_Demographics.Last_rdm_txn_dt)
for i in demo_cat_cols_list[1:]:
plotBar(Customer_Demographics, i,top_n=10)
Some columns here like the Income and Marital status are unspecified, as of now we cannot decide if they will help in the prediction, but we should keep a close watch for these.
demo_num_cols_list
def plotHist(dataframe, col_name):
data = [go.Histogram(x=dataframe[col_name],
marker=dict(
color='#CC0E1D',# Lava (#CC0E1D)
# color = 'rgb(200,0,0)' # you can provide color in HEX format or rgb format, genrally programmers prefer HEX format as it is a single string value and easy to pass as a variable
))]
layout = go.Layout(title = "Histogram of {}".format(col_name))
fig = go.Figure(data= data, layout=layout)
return iplot(fig)
plotHist(Customer_Demographics, 'Age')
Looks like the age group of 30-40 are frequent visitors of the store, also we see some entries above 100, I will remove them
Customer_Demographics.Age = Customer_Demographics.Age.clip_upper(100)
plotHist(Customer_Demographics, 'Age')
plotHist(Customer_Demographics, 'Points')
Most people have 0-2000 points
Customer_Demographics[Customer_Demographics.Points>2000].shape
Just above 1000 customers have more than 2000 points
Let us see if these customers are distinguishable from the rest
for i in demo_cat_cols_list[1:]:
plotBar(Customer_Demographics[Customer_Demographics.Points>2000], i,top_n=10)
The distribution of all the above columns look very similar to those of the unfiltered data. There is no disctinguishing factor as such.
def plotMultiBox(dataframe,col_name, num_col_name):
data = []
for i in dataframe[col_name].unique():
trace = go.Box(y = dataframe[num_col_name][dataframe[col_name] == i],
name = i)
data.append(trace)
layout = go.Layout(title="Boxplot of levels in {} for {} column".format(col_name,num_col_name))
fig = go.Figure(data=data, layout=layout)
return (iplot(fig))
plotMultiBox(Customer_Demographics, 'Loyalty_Status', 'custSince')
We see that in general, gold customers are with the company longer
plotMultiBox(Customer_Demographics, 'Gender', 'custSince')
Females have been customers for a marginally more number of days than males.
plotMultiBox(Customer_Demographics, 'Income_Range', 'custSince')
Observations:
- An interesting observation here, customers who earn more than 20,000 Dhirams have been with the company for over 2500 days, also there are no recent entries for such high income groups.
- The recent entries are from <20000 dhiram earning group. May be the store now has more affordable goods.
- This could also be a result of the location of the malls
Customer_Transaction.columns
Customer_Transaction.dtypes
Customer_Transaction.head(20)
plotHist(Customer_Transaction,'Revenue')
Customer_Transaction.Customer_ID = Customer_Transaction.Customer_ID.astype('O')
tran_num_cols_list, tran_cat_cols_list, tran_date_cols_list, tran_bool_cols_list = extractColTypes(Customer_Transaction)
tran_cat_cols_list.remove('Customer_ID')
for i in tran_cat_cols_list:
plotBar(Customer_Transaction, i,top_n=10)
Observations:
- 7% return rate in the shops
- Most returns are related to Size Problems
- Store type, business and delivery have only one value
Customer_Demographics[['Last_accr_txn_dt', 'Last_rdm_txn_dt']].isnull().sum()
Customer_Demographics['recent_tran_date'] = Customer_Demographics[['Last_accr_txn_dt', 'Last_rdm_txn_dt']].apply(max, axis = 1)
Customer_Demographics[['Last_accr_txn_dt', 'Last_rdm_txn_dt','recent_tran_date']].head()
cust_recent_tran_date = Customer_Demographics[['Customer_ID','recent_tran_date']]
cust_recent_tran_date.isnullll().sum()
# same functions used before
def getRecency(x):
t = -1*(x - today)
return t.apply(lambda x : x.days)
custId_Recency = pd.DataFrame(cust_recent_tran_date['Customer_ID'])
custId_Recency['Recency'] = getDaysDiff(cust_recent_tran_date.recent_tran_date)
custId_Recency.shape
custId_Recency.head()
Considering the data in Customer Transaction table, we have the returns data as well, we will consider return as -1 invoice or -revenue
Customer_Transaction.columns
Customer_Transaction.Transaction_Type.unique()
# Customer_Transaction[Customer_Transaction.Transaction_Type != 'Return'].shape
Customer_Transaction[Customer_Transaction.Transaction_Type == 'Return'].Invoices = -1 * Customer_Transaction[Customer_Transaction.Transaction_Type == 'Return'].Invoices
custId_Frequency = Customer_Transaction.groupby(['Customer_ID']).agg({'Invoices': np.sum}).reset_index()
custId_Frequency.columns = ['Customer_ID', 'Frequency']
custId_Frequency.shape
custId_Frequency.head()
custId_Frequency.Frequency[custId_Frequency.Frequency<0]
custId_Monetary = Customer_Transaction.groupby(['Customer_ID']).agg({'Revenue': np.sum}).reset_index()
custId_Monetary.columns = ['Customer_ID','MonetaryValue']
custId_Monetary.shape
custId_Monetary.head()
custId_RFM = custId_Recency.join(custId_Frequency.set_index('Customer_ID'),on='Customer_ID')
custId_RFM = custId_RFM.join(custId_Monetary.set_index('Customer_ID'),on='Customer_ID',)
custId_RFM.head()
custId_RFM['RecencyRank'] = pd.qcut(custId_RFM.Recency,q = 5, labels = False)
custId_RFM['FrequencyRank'] = pd.qcut(custId_RFM.Frequency,q = 5, labels = False, duplicates = 'drop')
custId_RFM['MonetaryValueRank'] = pd.qcut(custId_RFM.MonetaryValue,q = 5, labels = False , duplicates = 'drop')
custId_RFM.shape
custId_RFM.head()
custId_RFM.isnull().sum()
custId_RFM.dropna(inplace=True)
custId_RFM.RecencyRank.unique()
for i in custId_RFM.columns:
if 'Rank' in i:
print (i)
custId_RFM[i] = custId_RFM[i].apply(lambda x : str(int(np.round(x)+1)))
custId_RFM.RecencyRank.unique()
custId_RFM.head()
custId_RFM['RFMScore'] = custId_RFM.RecencyRank + custId_RFM.FrequencyRank + custId_RFM.MonetaryValueRank
custId_RFM.head()
custId_RFM.dtypes
custId_RFM.to_csv(datapath + "RFM.csv",index = False)
custId_RFM.RFMScore.value_counts()
Segments
- Best Customers : Recent, frequent and high monetary value
- High-spending New Customers : Recent , not so frequent but high spending
- Lowest-Spending Active Loyal Customers : recent and frequent but spend less
- Churned Best Customers : Not recent but frequent and high spending
segments_dict = {'155':'Best_Customers',
'145':'Best_Customers',
'154':'Best_Customers',
'244':'Best_Customers',
'254':'Best_Customers',
'245':'Best_Customers',
'135': 'High_Spending_New_Customers',
'235': 'High_Spending_New_Customers',
'134': 'High_Spending_New_Customers',
'125': 'High_Spending_New_Customers',
'124': 'High_Spending_New_Customers',
'225': 'High_Spending_New_Customers',
'151': 'Lowest_Spending_Active_Loyal_Customers',
'152': 'Lowest_Spending_Active_Loyal_Customers',
'153': 'Lowest_Spending_Active_Loyal_Customers',
'141': 'Lowest_Spending_Active_Loyal_Customers',
'142': 'Lowest_Spending_Active_Loyal_Customers',
'143': 'Lowest_Spending_Active_Loyal_Customers',
'251': 'Lowest_Spending_Active_Loyal_Customers',
'252': 'Lowest_Spending_Active_Loyal_Customers',
'253': 'Lowest_Spending_Active_Loyal_Customers',
'515': 'Curned_Best_Customers',
'514': 'Curned_Best_Customers',
'525': 'Curned_Best_Customers',
'524': 'Curned_Best_Customers',
'415': 'Curned_Best_Customers',
'414': 'Curned_Best_Customers',
}
We can add mote segments, but for this analysis I am limiting the number of segments
custId_RFM['CustomerSegment'] = custId_RFM.RFMScore
custId_RFM['CustomerSegment'] = custId_RFM['CustomerSegment'].replace(segments_dict)
## Best_Customers
Best_Customers = go.Scatter3d(
x=custId_RFM.Recency[custId_RFM['CustomerSegment']=='Best_Customers'],
y=custId_RFM.Frequency[custId_RFM['CustomerSegment']=='Best_Customers'],
z=custId_RFM.MonetaryValue[custId_RFM['CustomerSegment']=='Best_Customers'],
mode='markers',name ='Best_Customers',
marker=dict(
size=4,
opacity=0.6
)
)
## High_Spending_New_Customers
High_Spending_New_Customers = go.Scatter3d(
x=custId_RFM.Recency[custId_RFM['CustomerSegment']=='High_Spending_New_Customers'],
y=custId_RFM.Frequency[custId_RFM['CustomerSegment']=='High_Spending_New_Customers'],
z=custId_RFM.MonetaryValue[custId_RFM['CustomerSegment']=='High_Spending_New_Customers'],
mode='markers',name ='High_Spending_New_Customers',
marker=dict(
size=4,
opacity=0.6
)
)
## Lowest_Spending_Active_Loyal_Customers
Lowest_Spending_Active_Loyal_Customers = go.Scatter3d(
x=custId_RFM.Recency[custId_RFM['CustomerSegment']=='Lowest_Spending_Active_Loyal_Customers'],
y=custId_RFM.Frequency[custId_RFM['CustomerSegment']=='Lowest_Spending_Active_Loyal_Customers'],
z=custId_RFM.MonetaryValue[custId_RFM['CustomerSegment']=='Lowest_Spending_Active_Loyal_Customers'],
mode='markers',name ='Lowest_Spending_Active_Loyal_Customers',
marker=dict(
size=4,
opacity=0.6
)
)
## Curned_Best_Customers
Curned_Best_Customers = go.Scatter3d(
x=custId_RFM.Recency[custId_RFM['CustomerSegment']=='Curned_Best_Customers'],
y=custId_RFM.Frequency[custId_RFM['CustomerSegment']=='Curned_Best_Customers'],
z=custId_RFM.MonetaryValue[custId_RFM['CustomerSegment']=='Curned_Best_Customers'],
mode='markers',name ='Curned_Best_Customers',
marker=dict(
size=4,
opacity=0.6
)
)
data = [Best_Customers,High_Spending_New_Customers, Lowest_Spending_Active_Loyal_Customers ,Curned_Best_Customers]
layout = go.Layout(
scene = dict(
xaxis = dict(
title='Recency',
backgroundcolor="black",
showbackground=True,
titlefont=dict(
size=16,
color='black'
)
),
yaxis = dict(
title='Frequency',
showbackground=True,
backgroundcolor="black",
titlefont=dict(
size=16,
color='black'
)
),
zaxis = dict(
title='MonetaryValue',
backgroundcolor="black",
showbackground=True,
titlefont=dict(
size=16,
color='black'
)
)
),
width=1000, # height of the figure in pixels
height=800, # height of the figure in pixels
margin = dict( b =15),)
fig = go.Figure(data=data, layout=layout)
fig['layout'].update(title= "RFM Customer Segmentation")
iplot(fig)
Observations:
- Churned best customers had a maximum monetary value of 6000 dhirams. did not shop in the last 1.5 year.
- Best customers have shopped in the last 3 months and on an average shopped about 50 times.
- High spending new customers have a maximum frequency of 9.
- Lowest spending active loyal customers have visited less than 10 times in the last 2.5 months and spent a maximum of 500 dhirams.
Thank you for reading!